sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.5.1  magrittr_1.5    tools_3.5.1     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.0      stringi_1.2.4   rmarkdown_1.11 
##  [9] knitr_1.20      stringr_1.3.1   digest_0.6.18   evaluate_0.12

User Inputs

# Unpack the knitting parameters into top-level variables so the rest of
# the document can reference them directly.
param.keys <- c("output.var", "log.pred", "eda", "algo.forward",
                "algo.backward", "algo.stepwise", "algo.LASSO", "algo.LARS")
for (key in param.keys) {
  assign(key, params[[key]])
}

message("Parameters used for training/prediction: ")
str(params)
## List of 8
##  $ output.var   : chr "y3"
##  $ log.pred     : logi TRUE
##  $ eda          : logi TRUE
##  $ algo.forward : logi FALSE
##  $ algo.backward: logi FALSE
##  $ algo.stepwise: logi FALSE
##  $ algo.LASSO   : logi FALSE
##  $ algo.LARS    : logi FALSE
# Setup Labels
# alt.scale.label.name = Alternate Scale variable name
#   - if predicting on log, then alt.scale is normal scale
#   - if predicting on normal scale, then alt.scale is log scale
#
# Single if/else instead of the two disjoint if-statements: previously a
# log.pred value that was neither TRUE nor FALSE left both label variables
# undefined and failed only later with an obscure error.
if (isTRUE(log.pred)) {
  label.names <- paste0("log.", output.var)
  alt.scale.label.name <- output.var
} else {
  label.names <- output.var
  alt.scale.label.name <- paste0("log.", output.var)
}

Prepare Data

Read and Clean Features

# Load the raw feature matrix (one row per JobName).
features <- read.csv("../../Data/features.csv")
# str(features)

Checking correlations to evaluate removal of redundant features

# Pairwise correlations among the numeric feature columns, rounded for display.
corr.matrix <- round(cor(Filter(is.numeric, features)), 2)

# Keep only variables that correlate strongly with at least one OTHER
# variable: zero the diagonal first so self-correlation never qualifies.
threshold <- 0.6
off.diag <- corr.matrix
diag(off.diag) <- 0
strongly.correlated <- apply(abs(off.diag) >= threshold, 1, any)
high.corr.matrix <- off.diag[strongly.correlated, strongly.correlated]

DT::datatable(corr.matrix)
DT::datatable(high.corr.matrix)

Feature Names

# Model inputs: every feature column except the JobName identifier.
feature.names <- setdiff(colnames(features), "JobName")
# str(feature.names)

Read and Clean Labels

# Load labels and keep only the job id plus the response selected via params.
labels <- read.csv("../../Data/labels.csv")[, c("JobName", output.var)]
# str(labels)
summary(labels)
##       JobName           y3        
##  Job_00001:   1   Min.   : 95.91  
##  Job_00002:   1   1st Qu.:118.21  
##  Job_00003:   1   Median :123.99  
##  Job_00004:   1   Mean   :125.36  
##  Job_00005:   1   3rd Qu.:131.06  
##  Job_00006:   1   Max.   :193.73  
##  (Other)  :9994   NA's   :2497

Merge Datasets

# Join features to labels on the job id, then drop the id column — it is
# an identifier, not a predictor.
data <- merge(features, labels, by = "JobName")
data[["JobName"]] <- NULL
# str(data)

Transformations

#str(data)
# When modeling on the log scale, derive log10(label) and remove the
# original-scale column so only one response remains in the frame.
if (log.pred) {
  data[[label.names]] <- log(data[[alt.scale.label.name]], 10)
  data[[alt.scale.label.name]] <- NULL
}
# str(data)

Remove NA Cases

data = data[complete.cases(data),]

Check correlation of Label with Features

# Correlation of every feature with the label, rounded for the widget.
if (eda) {
  feature.cols <- dplyr::select(data, -one_of(label.names))
  label.col <- dplyr::select_at(data, label.names)
  corr.to.label <- round(cor(feature.cols, label.col), 4)
  DT::datatable(corr.to.label)
}

Multicollinearity - VIF

# Variance inflation factors, largest first. Fully namespaced: the original
# used bare select_at() and %>%, which only work if dplyr/magrittr happen to
# be attached — the sibling chunk above already uses the dplyr:: prefix, so
# this is made consistent (and pipe-free).
if (eda) {
  vifDF <- dplyr::arrange(usdm::vif(dplyr::select_at(data, feature.names)),
                          dplyr::desc(VIF))
  head(vifDF, 10)
}
##    Variables      VIF
## 1     stat31 1.065342
## 2    stat202 1.063139
## 3    stat113 1.061198
## 4        x22 1.060427
## 5    stat200 1.060168
## 6         x6 1.058441
## 7     stat14 1.058339
## 8    stat147 1.058274
## 9    stat207 1.058175
## 10   stat215 1.058161

Exploratory Data Analysis

Scatterplots

# Histogram panel for use on the diagonal of pairs() plots: rescales the
# panel's user coordinates, then draws the bar heights normalized to [0, 1].
panel.hist <- function(x, ...)
{
    prev <- par("usr"); on.exit(par(prev))
    par(usr = c(prev[1:2], 0, 1.5))
    bins <- hist(x, plot = FALSE)
    edges <- bins$breaks
    n.edges <- length(edges)
    heights <- bins$counts / max(bins$counts)
    rect(edges[-n.edges], 0, edges[-1], heights, col = "cyan", ...)
}
# Distribution of the (possibly log-scaled) label.
# (hist call kept verbatim: its default title is deparsed from the expression.)
if (eda) {
  hist(data[ ,label.names])
  # hist(data[complete.cases(data), alt.scale.label.name])
}

# https://stackoverflow.com/questions/24648729/plot-one-numeric-variable-against-n-numeric-variables-in-n-plots
# Scatterplot of yvar against each xvar, one chart per predictor.
#   data:  data.frame holding all referenced columns
#   xvars: predictor column names; defaults to every column except yvar
#   yvar:  response column name
# Returns NULL invisibly (called for its plotting side effect).
ind.pairs.plot <- function(data, xvars=NULL, yvar)
{
    if (is.null(xvars)) {
        xvars <- setdiff(names(data), yvar)
    }

    # seq_along() rather than 1:length(xvars): an empty xvars now draws
    # nothing instead of erroring on the bogus index sequence c(1, 0).
    for (i in seq_along(xvars)) {
        plot(data[, xvars[i]], data[, yvar], xlab = xvars[i], ylab = yvar)
    }
}

# Full EDA sweep: one scatterplot of the label against every feature.
if (eda) {
  ind.pairs.plot(data, feature.names, label.names)
}

Feature Engineering

# NOTE(review): this block runs only when eda == FALSE, yet it draws the kind
# of exploratory plots the eda flag elsewhere gates on eda == TRUE — the
# condition looks inverted; confirm intent before changing.
if(eda ==FALSE){
  # x18 may need transformations
  plot(data[,'x18'], data[,label.names], main = "Original Scatter Plot vs. x18", ylab = label.names, xlab = 'x18')
  # NOTE(review): the title below still says "Original" although the x-axis is
  # sqrt(x18) — looks copy-pasted; confirm.
  plot(sqrt(data[,'x18']), data[,label.names], main = "Original Scatter Plot vs. sqrt(x18)", ylab = label.names, xlab = 'sqrt(x18)')
  
  # transforming x18
  # Replace x18 with its square root in the modeling frame.
  # NOTE(review): feature.names still lists 'x18', so downstream code that
  # indexes columns by feature.names (e.g. the LASSO design matrix) would
  # fail after this runs — verify.
  data$sqrt.x18 = sqrt(data$x18)
  data = dplyr::select(data,-one_of('x18'))
  
  # what about x7, x9?
  # x11 looks like data is at discrete points after a while. Will this be a problem?
}

Modeling

Train Test Split

# Shuffle the rows, then carve an 80/20 train/test split on the label.
# NOTE(review): sample.split is caTools::sample.split — presumably the
# package is attached in an unseen setup chunk; confirm.
data <- data[sample(nrow(data)), ]
split <- sample.split(data[[label.names]], SplitRatio = 0.8)

data.train <- data[split, ]
data.test <- data[!split, ]

Common Functions

# Standard residual-diagnostic panel for a fitted lm.
#   model: fitted lm object
#   train: data the model was fit on (predict() is evaluated against it)
# Draws the base plot(model) panels, studentized and standardized residual
# plots against the fitted values, and a residual histogram with a standard
# normal curve overlaid. Called for its plotting side effects.
plot.diagnostics <-  function(model, train) {
  plot(model)
  
  residuals = resid(model) # Plotted above in plot(lm.out)
  r.standard = rstandard(model)
  r.student = rstudent(model)

  # Studentized residuals vs fitted values
  plot(predict(model,train),r.student,
      ylab="Student Residuals", xlab="Predicted Values", 
      main="Student Residual Plot") 
  abline(0, 0)
  
  # Standardized residuals vs fitted values, with +/-2 reference lines
  # marking the conventional outlier band.
  plot(predict(model, train),r.standard,
      ylab="Standard Residuals", xlab="Predicted Values", 
      main="Standard Residual Plot") 
  abline(0, 0)
  abline(2, 0)
  abline(-2, 0)
  
  # Histogram
  hist(r.student, freq=FALSE, main="Distribution of Studentized Residuals", 
  xlab="Studentized Residuals", ylab="Density", ylim=c(0,0.5))

  # Create range of x-values for normal curve
  xfit <- seq(min(r.student)-1, max(r.student)+1, length=40)

  # Generate values from the normal distribution at the specified values
  yfit <- (dnorm(xfit))

  # Add the normal curve
  lines(xfit, yfit, ylim=c(0,0.5))
  
}

Setup Formulae

# Build "label ~ all features" and "label ~ 1" formulas from the
# training-frame column names.
n <- names(data.train)
lhs <- paste(n[n %in% label.names], collapse = " + ")
rhs <- paste(n[!n %in% label.names], collapse = " + ")
formula <- as.formula(paste(lhs, "~", rhs))
grand.mean.formula <- as.formula(paste(lhs, "~ 1"))
print(formula)
## log.y3 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + 
##     x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + 
##     x22 + x23 + stat1 + stat2 + stat3 + stat4 + stat5 + stat6 + 
##     stat7 + stat8 + stat9 + stat10 + stat11 + stat12 + stat13 + 
##     stat14 + stat15 + stat16 + stat17 + stat18 + stat19 + stat20 + 
##     stat21 + stat22 + stat23 + stat24 + stat25 + stat26 + stat27 + 
##     stat28 + stat29 + stat30 + stat31 + stat32 + stat33 + stat34 + 
##     stat35 + stat36 + stat37 + stat38 + stat39 + stat40 + stat41 + 
##     stat42 + stat43 + stat44 + stat45 + stat46 + stat47 + stat48 + 
##     stat49 + stat50 + stat51 + stat52 + stat53 + stat54 + stat55 + 
##     stat56 + stat57 + stat58 + stat59 + stat60 + stat61 + stat62 + 
##     stat63 + stat64 + stat65 + stat66 + stat67 + stat68 + stat69 + 
##     stat70 + stat71 + stat72 + stat73 + stat74 + stat75 + stat76 + 
##     stat77 + stat78 + stat79 + stat80 + stat81 + stat82 + stat83 + 
##     stat84 + stat85 + stat86 + stat87 + stat88 + stat89 + stat90 + 
##     stat91 + stat92 + stat93 + stat94 + stat95 + stat96 + stat97 + 
##     stat98 + stat99 + stat100 + stat101 + stat102 + stat103 + 
##     stat104 + stat105 + stat106 + stat107 + stat108 + stat109 + 
##     stat110 + stat111 + stat112 + stat113 + stat114 + stat115 + 
##     stat116 + stat117 + stat118 + stat119 + stat120 + stat121 + 
##     stat122 + stat123 + stat124 + stat125 + stat126 + stat127 + 
##     stat128 + stat129 + stat130 + stat131 + stat132 + stat133 + 
##     stat134 + stat135 + stat136 + stat137 + stat138 + stat139 + 
##     stat140 + stat141 + stat142 + stat143 + stat144 + stat145 + 
##     stat146 + stat147 + stat148 + stat149 + stat150 + stat151 + 
##     stat152 + stat153 + stat154 + stat155 + stat156 + stat157 + 
##     stat158 + stat159 + stat160 + stat161 + stat162 + stat163 + 
##     stat164 + stat165 + stat166 + stat167 + stat168 + stat169 + 
##     stat170 + stat171 + stat172 + stat173 + stat174 + stat175 + 
##     stat176 + stat177 + stat178 + stat179 + stat180 + stat181 + 
##     stat182 + stat183 + stat184 + stat185 + stat186 + stat187 + 
##     stat188 + stat189 + stat190 + stat191 + stat192 + stat193 + 
##     stat194 + stat195 + stat196 + stat197 + stat198 + stat199 + 
##     stat200 + stat201 + stat202 + stat203 + stat204 + stat205 + 
##     stat206 + stat207 + stat208 + stat209 + stat210 + stat211 + 
##     stat212 + stat213 + stat214 + stat215 + stat216 + stat217
print(grand.mean.formula)
## log.y3 ~ 1

Full & Grand Means Model

# Fit the full model (all features) on the training split.
model.full <- lm(formula, data = data.train)
summary(model.full)
## 
## Call:
## lm(formula = formula, data = data.train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.084381 -0.021044 -0.004689  0.016465  0.186417 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.991e+00  9.252e-03 215.234  < 2e-16 ***
## x1          -1.778e-04  6.469e-04  -0.275 0.783506    
## x2           2.614e-04  4.135e-04   0.632 0.527339    
## x3           1.110e-04  1.128e-04   0.984 0.324925    
## x4          -3.884e-05  8.872e-06  -4.378 1.22e-05 ***
## x5           4.110e-04  2.911e-04   1.412 0.158109    
## x6           2.404e-04  5.871e-04   0.410 0.682156    
## x7           1.152e-02  6.300e-04  18.286  < 2e-16 ***
## x8           4.396e-04  1.469e-04   2.992 0.002783 ** 
## x9           3.101e-03  3.286e-04   9.438  < 2e-16 ***
## x10          1.453e-03  3.052e-04   4.762 1.97e-06 ***
## x11          1.939e+05  7.301e+04   2.655 0.007947 ** 
## x12         -8.972e-05  1.863e-04  -0.482 0.630030    
## x13          1.250e-04  7.421e-05   1.685 0.092098 .  
## x14         -5.789e-04  3.192e-04  -1.814 0.069744 .  
## x15         -5.426e-06  3.047e-04  -0.018 0.985794    
## x16          1.058e-03  2.100e-04   5.041 4.77e-07 ***
## x17          1.489e-03  3.209e-04   4.642 3.53e-06 ***
## x18          5.999e-03  2.241e-04  26.762  < 2e-16 ***
## x19          2.878e-04  1.642e-04   1.753 0.079583 .  
## x20         -5.318e-04  1.126e-03  -0.472 0.636727    
## x21          1.199e-04  4.180e-05   2.869 0.004138 ** 
## x22         -3.859e-04  3.404e-04  -1.133 0.257064    
## x23         -8.337e-05  3.221e-04  -0.259 0.795785    
## stat1       -3.616e-05  2.452e-04  -0.147 0.882788    
## stat2        1.547e-04  2.443e-04   0.633 0.526443    
## stat3        5.181e-04  2.460e-04   2.106 0.035229 *  
## stat4       -5.047e-04  2.461e-04  -2.051 0.040313 *  
## stat5       -1.668e-04  2.465e-04  -0.677 0.498537    
## stat6       -9.581e-05  2.455e-04  -0.390 0.696317    
## stat7       -9.974e-05  2.450e-04  -0.407 0.683933    
## stat8       -2.072e-04  2.453e-04  -0.845 0.398275    
## stat9       -1.237e-06  2.452e-04  -0.005 0.995977    
## stat10      -2.820e-04  2.442e-04  -1.155 0.248218    
## stat11      -1.236e-04  2.463e-04  -0.502 0.615809    
## stat12       2.051e-04  2.445e-04   0.839 0.401609    
## stat13      -2.999e-04  2.442e-04  -1.228 0.219375    
## stat14      -8.128e-04  2.435e-04  -3.338 0.000849 ***
## stat15      -2.859e-04  2.421e-04  -1.181 0.237629    
## stat16       1.428e-04  2.448e-04   0.584 0.559522    
## stat17       5.120e-05  2.425e-04   0.211 0.832766    
## stat18      -2.317e-04  2.438e-04  -0.950 0.342064    
## stat19       2.470e-04  2.448e-04   1.009 0.313029    
## stat20      -1.184e-04  2.441e-04  -0.485 0.627551    
## stat21       2.964e-05  2.454e-04   0.121 0.903868    
## stat22      -3.191e-04  2.463e-04  -1.295 0.195302    
## stat23       6.710e-04  2.435e-04   2.756 0.005867 ** 
## stat24      -3.367e-04  2.447e-04  -1.376 0.168807    
## stat25      -3.900e-04  2.449e-04  -1.592 0.111381    
## stat26      -3.175e-04  2.453e-04  -1.295 0.195512    
## stat27       1.809e-04  2.446e-04   0.740 0.459504    
## stat28       7.814e-06  2.458e-04   0.032 0.974644    
## stat29       2.682e-04  2.454e-04   1.093 0.274619    
## stat30       1.192e-04  2.473e-04   0.482 0.629857    
## stat31      -9.795e-05  2.464e-04  -0.397 0.691025    
## stat32       8.302e-05  2.477e-04   0.335 0.737486    
## stat33      -2.597e-04  2.443e-04  -1.063 0.287729    
## stat34       1.751e-04  2.450e-04   0.715 0.474816    
## stat35      -5.028e-04  2.458e-04  -2.045 0.040876 *  
## stat36       1.090e-04  2.436e-04   0.447 0.654641    
## stat37      -4.658e-04  2.476e-04  -1.881 0.060018 .  
## stat38       2.904e-04  2.455e-04   1.183 0.236955    
## stat39      -1.397e-04  2.433e-04  -0.574 0.565931    
## stat40      -1.952e-04  2.444e-04  -0.799 0.424550    
## stat41      -3.806e-04  2.428e-04  -1.567 0.117124    
## stat42      -4.331e-04  2.437e-04  -1.777 0.075596 .  
## stat43      -2.733e-04  2.454e-04  -1.114 0.265454    
## stat44       8.283e-05  2.442e-04   0.339 0.734518    
## stat45      -5.279e-04  2.442e-04  -2.162 0.030658 *  
## stat46       2.831e-04  2.459e-04   1.151 0.249784    
## stat47       1.648e-04  2.466e-04   0.668 0.503896    
## stat48       4.257e-04  2.447e-04   1.740 0.081974 .  
## stat49       3.364e-05  2.425e-04   0.139 0.889691    
## stat50       3.425e-04  2.436e-04   1.406 0.159729    
## stat51       3.250e-04  2.453e-04   1.325 0.185254    
## stat52      -1.063e-04  2.460e-04  -0.432 0.665749    
## stat53      -2.931e-04  2.461e-04  -1.191 0.233810    
## stat54      -4.229e-04  2.463e-04  -1.717 0.086010 .  
## stat55       2.602e-04  2.442e-04   1.066 0.286641    
## stat56      -3.205e-04  2.448e-04  -1.309 0.190476    
## stat57      -9.730e-05  2.420e-04  -0.402 0.687712    
## stat58      -8.365e-05  2.436e-04  -0.343 0.731293    
## stat59       2.193e-04  2.441e-04   0.899 0.368894    
## stat60       3.568e-04  2.443e-04   1.460 0.144274    
## stat61      -9.795e-05  2.453e-04  -0.399 0.689645    
## stat62      -4.678e-05  2.450e-04  -0.191 0.848558    
## stat63       3.456e-04  2.446e-04   1.413 0.157699    
## stat64      -2.393e-04  2.425e-04  -0.987 0.323730    
## stat65      -1.148e-04  2.455e-04  -0.468 0.639990    
## stat66       1.280e-04  2.472e-04   0.518 0.604550    
## stat67       1.234e-04  2.457e-04   0.502 0.615349    
## stat68      -2.778e-04  2.457e-04  -1.131 0.258238    
## stat69      -1.276e-04  2.452e-04  -0.520 0.602904    
## stat70       3.471e-04  2.440e-04   1.423 0.154907    
## stat71       7.451e-05  2.438e-04   0.306 0.759930    
## stat72       8.108e-05  2.469e-04   0.328 0.742584    
## stat73       4.192e-04  2.471e-04   1.697 0.089784 .  
## stat74       1.359e-05  2.456e-04   0.055 0.955870    
## stat75      -3.129e-04  2.467e-04  -1.268 0.204672    
## stat76       7.657e-05  2.447e-04   0.313 0.754379    
## stat77      -2.668e-04  2.465e-04  -1.083 0.279050    
## stat78      -1.476e-04  2.467e-04  -0.598 0.549658    
## stat79       6.783e-05  2.452e-04   0.277 0.782057    
## stat80       1.591e-04  2.471e-04   0.644 0.519714    
## stat81       4.252e-04  2.455e-04   1.732 0.083361 .  
## stat82       2.242e-04  2.436e-04   0.920 0.357475    
## stat83      -1.917e-04  2.450e-04  -0.782 0.434049    
## stat84      -1.487e-04  2.442e-04  -0.609 0.542653    
## stat85       1.147e-04  2.460e-04   0.466 0.640967    
## stat86      -1.590e-05  2.446e-04  -0.065 0.948181    
## stat87      -5.559e-04  2.453e-04  -2.266 0.023473 *  
## stat88      -1.080e-04  2.428e-04  -0.445 0.656556    
## stat89      -2.288e-04  2.447e-04  -0.935 0.349868    
## stat90      -2.530e-04  2.450e-04  -1.033 0.301787    
## stat91      -2.605e-04  2.442e-04  -1.067 0.286126    
## stat92      -3.101e-04  2.453e-04  -1.264 0.206123    
## stat93      -1.521e-04  2.469e-04  -0.616 0.538046    
## stat94      -3.399e-04  2.457e-04  -1.383 0.166615    
## stat95       4.036e-05  2.455e-04   0.164 0.869424    
## stat96      -9.747e-05  2.448e-04  -0.398 0.690552    
## stat97       2.186e-04  2.430e-04   0.900 0.368284    
## stat98       3.444e-03  2.425e-04  14.206  < 2e-16 ***
## stat99       1.924e-04  2.457e-04   0.783 0.433711    
## stat100      7.978e-04  2.447e-04   3.261 0.001118 ** 
## stat101      5.535e-05  2.465e-04   0.225 0.822319    
## stat102     -1.265e-04  2.449e-04  -0.516 0.605653    
## stat103     -3.548e-04  2.482e-04  -1.430 0.152884    
## stat104     -2.746e-04  2.441e-04  -1.125 0.260521    
## stat105      9.914e-05  2.435e-04   0.407 0.683936    
## stat106     -2.415e-04  2.442e-04  -0.989 0.322719    
## stat107     -1.933e-04  2.458e-04  -0.787 0.431552    
## stat108     -2.947e-04  2.459e-04  -1.199 0.230766    
## stat109      6.519e-06  2.436e-04   0.027 0.978648    
## stat110     -3.554e-03  2.432e-04 -14.615  < 2e-16 ***
## stat111     -1.203e-04  2.463e-04  -0.488 0.625270    
## stat112      5.211e-06  2.462e-04   0.021 0.983116    
## stat113     -1.843e-04  2.457e-04  -0.750 0.453115    
## stat114      1.618e-04  2.433e-04   0.665 0.506012    
## stat115     -8.573e-05  2.435e-04  -0.352 0.724803    
## stat116      3.732e-04  2.459e-04   1.518 0.129032    
## stat117     -1.973e-07  2.461e-04  -0.001 0.999361    
## stat118     -2.574e-04  2.446e-04  -1.052 0.292629    
## stat119      3.944e-06  2.439e-04   0.016 0.987099    
## stat120     -1.290e-04  2.430e-04  -0.531 0.595552    
## stat121     -3.766e-04  2.454e-04  -1.535 0.124889    
## stat122     -5.106e-05  2.431e-04  -0.210 0.833681    
## stat123      1.794e-05  2.476e-04   0.072 0.942238    
## stat124     -2.360e-04  2.451e-04  -0.963 0.335494    
## stat125      5.225e-05  2.463e-04   0.212 0.831986    
## stat126      1.696e-04  2.450e-04   0.692 0.488778    
## stat127      7.369e-05  2.440e-04   0.302 0.762696    
## stat128     -1.044e-06  2.449e-04  -0.004 0.996601    
## stat129      9.258e-05  2.441e-04   0.379 0.704508    
## stat130      1.269e-04  2.449e-04   0.518 0.604370    
## stat131      1.659e-04  2.464e-04   0.673 0.500674    
## stat132     -1.127e-04  2.432e-04  -0.463 0.643158    
## stat133      1.069e-04  2.453e-04   0.436 0.663047    
## stat134     -2.836e-04  2.446e-04  -1.160 0.246267    
## stat135     -8.239e-05  2.448e-04  -0.337 0.736432    
## stat136     -1.735e-04  2.452e-04  -0.707 0.479297    
## stat137     -3.669e-05  2.427e-04  -0.151 0.879881    
## stat138     -7.098e-05  2.445e-04  -0.290 0.771603    
## stat139      2.190e-05  2.458e-04   0.089 0.929023    
## stat140     -2.140e-05  2.430e-04  -0.088 0.929833    
## stat141      3.770e-04  2.434e-04   1.549 0.121385    
## stat142     -3.961e-05  2.482e-04  -0.160 0.873194    
## stat143      2.285e-04  2.450e-04   0.932 0.351159    
## stat144      4.989e-04  2.441e-04   2.044 0.041002 *  
## stat145      1.275e-04  2.471e-04   0.516 0.605914    
## stat146     -5.567e-04  2.469e-04  -2.254 0.024211 *  
## stat147     -3.076e-04  2.466e-04  -1.247 0.212285    
## stat148     -2.241e-04  2.428e-04  -0.923 0.356072    
## stat149     -4.686e-04  2.469e-04  -1.898 0.057735 .  
## stat150      4.899e-05  2.456e-04   0.199 0.841920    
## stat151     -1.836e-04  2.461e-04  -0.746 0.455786    
## stat152     -1.782e-04  2.452e-04  -0.727 0.467433    
## stat153      2.387e-04  2.488e-04   0.959 0.337350    
## stat154      7.632e-05  2.478e-04   0.308 0.758114    
## stat155     -8.946e-05  2.442e-04  -0.366 0.714163    
## stat156      1.649e-04  2.462e-04   0.670 0.503146    
## stat157     -7.952e-05  2.428e-04  -0.327 0.743304    
## stat158     -2.051e-04  2.474e-04  -0.829 0.407018    
## stat159     -8.435e-05  2.435e-04  -0.346 0.729081    
## stat160     -9.287e-06  2.457e-04  -0.038 0.969846    
## stat161      4.286e-04  2.460e-04   1.743 0.081435 .  
## stat162      8.260e-05  2.428e-04   0.340 0.733692    
## stat163      1.294e-04  2.479e-04   0.522 0.601796    
## stat164      3.002e-04  2.457e-04   1.222 0.221927    
## stat165     -9.308e-05  2.448e-04  -0.380 0.703747    
## stat166     -4.031e-04  2.432e-04  -1.657 0.097529 .  
## stat167     -2.805e-04  2.452e-04  -1.144 0.252718    
## stat168     -2.205e-04  2.437e-04  -0.905 0.365682    
## stat169      4.275e-05  2.449e-04   0.175 0.861424    
## stat170     -7.808e-05  2.448e-04  -0.319 0.749716    
## stat171      2.141e-04  2.455e-04   0.872 0.383199    
## stat172      4.107e-04  2.448e-04   1.677 0.093532 .  
## stat173     -1.353e-04  2.453e-04  -0.552 0.581171    
## stat174     -1.817e-04  2.456e-04  -0.740 0.459444    
## stat175     -3.439e-04  2.455e-04  -1.401 0.161270    
## stat176      6.551e-05  2.447e-04   0.268 0.788935    
## stat177      3.197e-05  2.454e-04   0.130 0.896358    
## stat178     -1.724e-04  2.467e-04  -0.699 0.484713    
## stat179      9.777e-05  2.444e-04   0.400 0.689187    
## stat180      1.523e-04  2.443e-04   0.623 0.533058    
## stat181      2.416e-04  2.453e-04   0.985 0.324708    
## stat182     -7.181e-05  2.455e-04  -0.292 0.769924    
## stat183      2.171e-04  2.447e-04   0.887 0.375002    
## stat184      1.533e-04  2.476e-04   0.619 0.536013    
## stat185     -1.363e-04  2.410e-04  -0.565 0.571817    
## stat186     -2.592e-04  2.466e-04  -1.051 0.293230    
## stat187     -6.975e-04  2.435e-04  -2.864 0.004193 ** 
## stat188     -1.985e-04  2.438e-04  -0.814 0.415550    
## stat189      1.749e-04  2.459e-04   0.711 0.476924    
## stat190      2.842e-05  2.447e-04   0.116 0.907539    
## stat191     -2.759e-04  2.448e-04  -1.127 0.259675    
## stat192      9.705e-05  2.460e-04   0.395 0.693148    
## stat193     -1.945e-04  2.476e-04  -0.786 0.432181    
## stat194     -1.809e-04  2.436e-04  -0.743 0.457775    
## stat195      4.676e-04  2.447e-04   1.911 0.056083 .  
## stat196     -3.594e-04  2.484e-04  -1.447 0.147945    
## stat197      1.132e-04  2.420e-04   0.468 0.639966    
## stat198     -5.725e-04  2.454e-04  -2.333 0.019705 *  
## stat199      3.342e-04  2.438e-04   1.371 0.170428    
## stat200     -5.132e-04  2.424e-04  -2.117 0.034294 *  
## stat201      4.143e-05  2.452e-04   0.169 0.865836    
## stat202     -2.010e-04  2.468e-04  -0.814 0.415410    
## stat203     -5.567e-05  2.447e-04  -0.228 0.820040    
## stat204     -4.042e-04  2.440e-04  -1.657 0.097590 .  
## stat205     -2.542e-04  2.436e-04  -1.044 0.296711    
## stat206     -2.263e-05  2.469e-04  -0.092 0.926966    
## stat207      4.118e-04  2.439e-04   1.689 0.091310 .  
## stat208      2.396e-05  2.445e-04   0.098 0.921939    
## stat209      1.889e-04  2.431e-04   0.777 0.437132    
## stat210     -2.048e-04  2.467e-04  -0.830 0.406379    
## stat211     -4.131e-05  2.448e-04  -0.169 0.866021    
## stat212     -1.460e-04  2.450e-04  -0.596 0.551172    
## stat213     -3.959e-05  2.464e-04  -0.161 0.872353    
## stat214     -3.346e-04  2.443e-04  -1.370 0.170853    
## stat215     -4.004e-04  2.452e-04  -1.633 0.102544    
## stat216     -1.404e-04  2.465e-04  -0.570 0.568930    
## stat217      1.454e-04  2.462e-04   0.591 0.554785    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0322 on 5761 degrees of freedom
## Multiple R-squared:  0.2554, Adjusted R-squared:  0.2243 
## F-statistic: 8.232 on 240 and 5761 DF,  p-value: < 2.2e-16
# Diagnostics for the full model.
plot.diagnostics(model.full, data.train)

# Intercept-only baseline used by the selection algorithms below.
model.null <- lm(grand.mean.formula, data = data.train)
summary(model.null)
## 
## Call:
## lm(formula = grand.mean.formula, data = data.train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.115119 -0.024167 -0.003381  0.020592  0.190194 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.096995   0.000472    4443   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03656 on 6001 degrees of freedom
plot.diagnostics(model.null, data.train)

## hat values (leverages) are all = 0.0001666111
##  and there are no factor predictors; no plot no. 5

Variable Selection

http://www.stat.columbia.edu/~martin/W2024/R10.pdf

Forward Selection

# Forward selection from the null model up to the full model, timed.
if (algo.forward) {
  t1 <- Sys.time()

  model.forward <- step(model.null,
                        scope = list(lower = model.null, upper = model.full),
                        direction = "forward")
  # Mid-block expressions inside a top-level `if` are not auto-printed when
  # knitting, so the summary never appeared in the report; print() it.
  print(summary(model.forward))

  t2 <- Sys.time()
  # format() keeps the difftime units; bare t2 - t1 pastes a unitless number
  # that silently switches between seconds and minutes.
  print(paste0("Time taken for Forward Selection: ", format(t2 - t1)))
}

Backward Elimination

# Backward elimination from the full model, timed.
if (algo.backward) {
  # Takes too much time
  t1 <- Sys.time()

  model.backward <- step(model.full, data = data.train, direction = "backward")
  # Mid-block expressions inside a top-level `if` are not auto-printed when
  # knitting, so the summary never appeared in the report; print() it.
  print(summary(model.backward))

  t2 <- Sys.time()
  # format() keeps the difftime units; bare t2 - t1 pastes a unitless number.
  print(paste0("Time taken for Backward Elimination: ", format(t2 - t1)))
}

Stepwise Selection

# Stepwise (bidirectional) selection starting from the null model, timed.
if (algo.stepwise) {
  t1 <- Sys.time()

  model.stepwise <- step(model.null, scope = list(upper = model.full),
                         data = data.train, direction = "both")
  # Mid-block expressions inside a top-level `if` are not auto-printed when
  # knitting, so the summary never appeared in the report; print() it.
  print(summary(model.stepwise))

  t2 <- Sys.time()
  # format() keeps the difftime units; bare t2 - t1 pastes a unitless number.
  print(paste0("Time taken for Stepwise Selection: ", format(t2 - t1)))
}

LASSO Selection

# Cross-validated LASSO (glmnet, 5 folds), timed; reports the coefficients
# surviving at lambda.1se.
# NOTE(review): feature.names still lists 'x18'; if the feature-engineering
# chunk replaced it with sqrt.x18, this column selection would fail — verify.
if (algo.LASSO) {
  t1 <- Sys.time()

  model.LASSO <- cv.glmnet(as.matrix(data.train[, feature.names]),
                           data.train[, label.names],
                           nfolds = 5, standardize = TRUE)
  # Mid-block expressions inside a top-level `if` are not auto-printed when
  # knitting; print() the summary so it shows up in the report.
  print(summary(model.LASSO))

  t2 <- Sys.time()
  # format() keeps the difftime units; bare t2 - t1 pastes a unitless number.
  print(paste0("Time taken for LASSO: ", format(t2 - t1)))

  plot(model.LASSO)
  # Coefficients at the most-regularized lambda within 1 SE of the CV minimum.
  best_lambda <- model.LASSO$lambda.1se
  lasso_coef <- model.LASSO$glmnet.fit$beta[, model.LASSO$glmnet.fit$lambda == best_lambda]
  print(lasso_coef)
  # Explicit print: only the features LASSO kept (non-zero coefficients).
  print(lasso_coef[abs(lasso_coef) > 0])
}
# summary(model.forward)
# summary(model.stepwise)